library(ggplot2)
library(plyr)
library(dplyr)
Attaching package: <U+393C><U+3E31>dplyr<U+393C><U+3E32>
The following objects are masked from <U+393C><U+3E31>package:plyr<U+393C><U+3E32>:
arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
The following objects are masked from <U+393C><U+3E31>package:stats<U+393C><U+3E32>:
filter, lag
The following objects are masked from <U+393C><U+3E31>package:base<U+393C><U+3E32>:
intersect, setdiff, setequal, union
library(corrplot)
corrplot 0.84 loaded
library(caret)
Loading required package: lattice
library(gridExtra)
Attaching package: <U+393C><U+3E31>gridExtra<U+393C><U+3E32>
The following object is masked from <U+393C><U+3E31>package:dplyr<U+393C><U+3E32>:
combine
library(scales)
library(Rmisc)
library(ggrepel)
library(psych)
Attaching package: <U+393C><U+3E31>psych<U+393C><U+3E32>
The following objects are masked from <U+393C><U+3E31>package:scales<U+393C><U+3E32>:
alpha, rescale
The following objects are masked from <U+393C><U+3E31>package:ggplot2<U+393C><U+3E32>:
%+%, alpha
################################# Reading in data and combining training and test data ########################################
#read files
train=read.csv("./inputs/train.csv", stringsAsFactors=FALSE)
test=read.csv("./inputs/test.csv", stringsAsFactors=FALSE)
#check read in files
print(train)
print(test)
#remove id from both train and test variables
train=train[,-1]
print(train)
test=test[,-1]
print(test)
#combine train and test sets for analysis and cleaning
#test set has missing saleprice (uneqal dimension for merging with train set)
#make a copy of the test set before adding the extra column
test_tmp=test
test_tmp$SalePrice= NA
print(test_tmp)
#combine the added-column test set with the train set
all=rbind(train,test_tmp)
print(all)
#finding the number of numeric codes
numericVars=which(sapply(all, is.numeric))
numericVarNames=names(numericVars)
sprintf("There are %s numeric variables", length(numericVars))
[1] "There are 37 numeric variables"
######################################### Tabulate the number of missing values ####################################################
#tabulate the number of missing values
miss_freq=sort(colSums(is.na(all)),decreasing=TRUE)
print(miss_freq)
PoolQC MiscFeature Alley Fence SalePrice FireplaceQu LotFrontage
2909 2814 2721 2348 1459 1420 486
GarageYrBlt GarageFinish GarageQual GarageCond GarageType BsmtCond BsmtExposure
159 159 159 159 157 82 82
BsmtQual BsmtFinType2 BsmtFinType1 MasVnrType MasVnrArea MSZoning Utilities
81 80 79 24 23 4 2
BsmtFullBath BsmtHalfBath Functional Exterior1st Exterior2nd BsmtFinSF1 BsmtFinSF2
2 2 2 1 1 1 1
BsmtUnfSF TotalBsmtSF Electrical KitchenQual GarageCars GarageArea SaleType
1 1 1 1 1 1 1
MSSubClass LotArea Street LotShape LandContour LotConfig LandSlope
0 0 0 0 0 0 0
Neighborhood Condition1 Condition2 BldgType HouseStyle OverallQual OverallCond
0 0 0 0 0 0 0
YearBuilt YearRemodAdd RoofStyle RoofMatl ExterQual ExterCond Foundation
0 0 0 0 0 0 0
Heating HeatingQC CentralAir X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
0 0 0 0 0 0 0
FullBath HalfBath BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces PavedDrive
0 0 0 0 0 0 0
WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal
0 0 0 0 0 0 0
MoSold YrSold SaleCondition
0 0 0
sprintf("There are %s columns of missing values", sum(miss_freq>0))
[1] "There are 35 columns of missing values"
######################################### To fix all 34 variables with missing values ################################################
PoolQC: Pool quality
Ex Excellent
Gd Good
TA Average/Typical
Fa Fair
NA No Pool#replace missing values in PoolQC
#According to the data description, PoolQC has Excellent, Good, Average, Fair and No Pool levels. NA represents No Pool.
all$PoolQC[is.na(all$PoolQC)]='No Pool'
#convert the variable to ordinal class
values=c("No Pool"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
all$PoolQC=as.integer(revalue(all$PoolQC,values))
The following `from` values were not present in `x`: Po, TA
class(all$PoolQC)
[1] "integer"
#assessing PoolQC with PoolArea
all[all$PoolQC==0 & all$PoolArea>0, c("PoolQC","PoolArea","OverallQual")]
#this indicates some error in filling up of the missing values. '0' indicates no pool yet another variable showing its pool area.
#here we assume pools exist for these houses but without quality ratings.
#we will replace these values with reference to "OverallQual" variable
#there are 10 ordinal categories in "OverallQual" and we resampled it into 5 categories to match with PoolQC range.
all[2421,"PoolQC"]=2
all[2504,"PoolQC"]=3
all[2600,"PoolQC"]=2
MiscFeature: Miscellaneous feature not covered in other categories
Elev Elevator
Gar2 2nd Garage (if not described in garage section)
Othr Other
Shed Shed (over 100 SF)
TenC Tennis Court
NA None#Replace missing values in MiscFeature
#MiscFeature account for the additional facilities that a house might possess, such as elevator, 2nd Gararge, Shed, Tennis Court and Others.
#The NAs are due to the house having no other additional facilities
#replacing 'NA' with 'None' and this feature should bein factor form.
all$MiscFeature[is.na(all$MiscFeature)]='None'
#convert the variable to factor class
all$MiscFeature=as.factor(all$MiscFeature)
class(all$MiscFeature)
[1] "factor"
Alley: Type of alley access to property
Grvl Gravel
Pave Paved
NA No alley access#NA represent no alley access
all$Alley[is.na(all$Alley)]='No alley access'
#convert the variable to factor class
all$Alley=as.factor(all$Alley)
class(all$Alley)
[1] "factor"
Fence: Fence quality
GdPrv Good Privacy
MnPrv Minimum Privacy
GdWo Good Wood
MnWw Minimum Wood/Wire
NA No Fence#NA represents No Fence
all$Fence[is.na(all$Fence)]='No Fence'
#converting this to factor class
all$Fence=as.factor(all$Fence)
class(all$Fence)
[1] "factor"
FireplaceQu: Fireplace quality
Ex Excellent - Exceptional Masonry Fireplace
Gd Good - Masonry Fireplace in main level
TA Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement
Fa Fair - Prefabricated Fireplace in basement
Po Poor - Ben Franklin Stove
NA No Fireplace#replace NA with no Fireplace
all$FireplaceQu[is.na(all$FireplaceQu)]="No Fireplace"
#another variable that is closely related to FireplaceQu is Fireplaces
#and let's check whether the number of houses with no fireplace in "Fireplaces" equates with the count in "FireplaceQu"
sprintf("The number of No Fireplace in variables FireplaceQu and Fireplaces are equal: %s",sum(all$FireplaceQu=="No Fireplace") == sum(all$Fireplaces==0))
[1] "The number of No Fireplace in variables FireplaceQu and Fireplaces are equal: TRUE"
#convert the variable to ordinal class
values=c("No Fireplace"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
all$FireplaceQu=as.integer(revalue(all$FireplaceQu,values))
class(all$FireplaceQu)
[1] "integer"
LotFrontage: Linear feet of street connected to property
Neighborhood: Physical locations within Ames city limits
Blmngtn Bloomington Heights
Blueste Bluestem
BrDale Briardale
BrkSide Brookside
ClearCr Clear Creek
CollgCr College Creek
Crawfor Crawford
Edwards Edwards
Gilbert Gilbert
IDOTRR Iowa DOT and Rail Road
MeadowV Meadow Village
Mitchel Mitchell
Names North Ames
NoRidge Northridge
NPkVill Northpark Villa
NridgHt Northridge Heights
NWAmes Northwest Ames
OldTown Old Town
SWISU South & West of Iowa State University
Sawyer Sawyer
SawyerW Sawyer West
Somerst Somerset
StoneBr Stone Brook
Timber Timberland
Veenker Veenker#get the index for the rows with missing values
Lot_NAs=all[is.na(all$LotFrontage),c("Neighborhood","LotFrontage")]
print(Lot_NAs)
idx=as.numeric(rownames(Lot_NAs))
print(class(idx))
[1] "numeric"
print(length(idx))
[1] 486
# #we will use the mean distance of each "Neighbourhood" to replace the missing values in "LotFrontage"
for (i in idx){
if(is.na(all$LotFrontage[i])){
all[i,"LotFrontage"]=as.integer(mean(all$LotFrontage[all$Neighborhood==all[i,"Neighborhood"]], na.rm=TRUE))
}
}
print(all[c(8,13,15,17),"LotFrontage"])
[1] 81 74 75 75
print(class(all$LotFrontage))
[1] "integer"
#associate houses with missing values in this column as houses that did not rebuild or remodel its garage.
#replace this missing values with the year the house is built.
all[is.na(all$GarageYrBlt),"GarageYrBlt"]=all$YearBuilt[is.na(all$GarageYrBlt)]
class(all$GarageYrBlt)
[1] "integer"
#check whether the values have been replaced
sum(is.na(all$GarageYrBlt))
[1] 0
Putting all variables related to garage together:
Number of missing values in garag-related components:
GarageFinish: Interior finish of the garage = 159 GarageQual: Garage quality = 159 GarageCond: Garage condition = 159 GarageType: Garage location = 157 GarageCars: Size of garage in car capacity = 1 GarageArea: Size of garage in square feet = 1
#GarageFinish, GarageQual, GarageCond, GarageType, GarageCars and GarageArea seems to be inter-related.
tmp=all[is.na(all$GarageFinish) & is.na(all$GarageQual) & is.na(all$GarageCond) & is.na(all$GarageType),]
sprintf("how many of these rows have NA values in GarageFinish, Qual, Cond, and Type: %s", dim(tmp)[1])
[1] "how many of these rows have NA values in GarageFinish, Qual, Cond, and Type: 157"
#the above result indicates the the 157 missing values found in GarageType is also found in GarageFinish, Quala and Cond
#
all[is.na(all$GarageFinish) & is.na(all$GarageQual) & is.na(all$GarageCond) & !is.na(all$GarageType), c("GarageFinish","GarageQual","GarageCond","GarageType","GarageCars","GarageArea")]
#all[is.na(all$GarageCars) | is.na(all$GarageArea), c("GarageFinish","GarageQual","GarageCond","GarageType","GarageCars","GarageArea")]
# the record 2127 seems to have a garage as it has value for GarageCars and GarageArea
# on the other hand, 2577 does not seem to have a garage
# lets fix these two entries first
# function: finding mode
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
# entry 2127
all[2127,"GarageFinish"]=getmode(all$GarageFinish)
all[2127,"GarageQual"]=getmode(all$GarageQual)
all[2127,"GarageCond"]=getmode(all$GarageCond)
# entry 2577
all[2577,"GarageType"]=NA
all[2577,"GarageCars"]=0
all[2577,"GarageArea"]=0
#check the rectification
print(all[c(2127,2577),c("GarageFinish","GarageQual","GarageCond","GarageType","GarageCars","GarageArea")])
tmp=all[is.na(all$GarageFinish) & is.na(all$GarageQual) & is.na(all$GarageCond) & is.na(all$GarageType),]
sprintf("how many of these rows have NA values in GarageFinish, Qual, Cond, and Type (after replacement): %s", dim(tmp)[1])
[1] "how many of these rows have NA values in GarageFinish, Qual, Cond, and Type (after replacement): 158"
GarageFinish: Interior finish of the garage
Fin Finished RFn Rough Finished
Unf Unfinished NA No Garage
#the NAs indicate "No Garage"
#replace NAs with "No Garage"
all[is.na(all$GarageFinish),"GarageFinish"]="No Garage"
#check the number of NAs again
sum(is.na(all$GarageFinish))
[1] 0
# convert this variable to ordinal
values=c('No Garage'=0, 'Unf'=1, 'RFn'=2, 'Fin'=3)
all$GarageFinish=as.integer(revalue(all$GarageFinish, values))
class(all$GarageFinish)
[1] "integer"
GarageQual: Garage quality
Ex Excellent Gd Good TA Typical/Average Fa Fair Po Poor NA No Garage
#the NAs indicate "No Garage"
#replace NAs with "No Garage"
all[is.na(all$GarageQual),"GarageQual"]="No Garage"
#check the number of NAs again
sum(is.na(all$GarageQual))
[1] 0
#convert to ordinal
values=c("No Garage"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
all$GarageQual=as.integer(revalue(all$GarageQual, values))
class(all$GarageQual)
[1] "integer"
GarageCond: Garage condition
Ex Excellent Gd Good TA Typical/Average Fa Fair Po Poor NA No Garage
#the NAs indicate "No Garage"
#replace NAs with "No Garage"
all[is.na(all$GarageCond),"GarageCond"]="No Garage"
#convert to ordinal
values=c("No Garage"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
all$GarageCond=as.integer(revalue(all$GarageCond, values))
class(all$GarageCond)
[1] "integer"
GarageType: Garage location
2Types More than one type of garage Attchd Attached to home Basment Basement Garage BuiltIn Built-In (Garage part of house - typically has room above garage) CarPort Car Port Detchd Detached from home NA No Garage
#the NAs indicate "No Garage"
#replace NAs with "No Garage"
all[is.na(all$GarageType),"GarageType"]="No Garage"
#convert to factor class
all$GarageType=as.factor(all$GarageType)
class(all$GarageType)
[1] "factor"
Putting all variables related to basement together:
No. of missing values in each variable:
BsmtCond: Evaluates the general condition of the basement = 82 BsmtExposure: Refers to walkout or garden level walls = 82
BsmtQual: Evaluates the height of the basement = 81 BsmtFinType2: Rating of basement finished area = 80 BsmtFinType1: Rating of basement finished area = 79 BsmtFullBath: Basement full bathrooms = 2 BsmtHalfBath: Basement half bathrooms = 2 BsmtFinSF1: Type 1 finished square feet = 1 BsmtFinSF2: Type 2 finished square feet = 1 BsmtUnfSF: Unfinished square feet of basement area = 1 TotalBsmtSF: Total square feet of basement area = 1
#check whether the 79 records with missing values in BsmtFinType1 correspond with those in BsmtCond, Exposure, Qual and FinType2
tmp=all[is.na(all$BsmtFinType1) & is.na(all$BsmtCond) & is.na(all$BsmtExposure) & is.na(all$BsmtQual) & is.na(all$BsmtFinType2),]
print(tmp)
sprintf("There are %s rows with NAs in bsmtfintype1, bsmtcond, bsmtexposure, bsmtqual and bsmtfintype2 concurrently ",dim(tmp)[1])
[1] "There are 79 rows with NAs in bsmtfintype1, bsmtcond, bsmtexposure, bsmtqual and bsmtfintype2 concurrently "
#finding the other records with NA in the basement variables.
all[!is.na(all$BsmtFinType1) & (is.na(all$BsmtCond)|is.na(all$BsmtExposure)|is.na(all$BsmtQual)|is.na(all$BsmtFinType2)),c("BsmtFinType1","BsmtCond","BsmtExposure","BsmtQual","BsmtFinType2")]
#each row is observed to have a missing value out of five variables
#we will replace these NAs with the mode.
all[333,"BsmtFinType2"]=getmode(all$BsmtFinType2)
all[949,"BsmtExposure"]=getmode(all$BsmtExposure)
all[1488,"BsmtExposure"]=getmode(all$BsmtExposure)
all[2041,"BsmtCond"]=getmode(all$BsmtCond)
all[2186,"BsmtCond"]=getmode(all$BsmtCond)
all[2218,"BsmtQual"]=getmode(all$BsmtQual)
all[2219,"BsmtQual"]=getmode(all$BsmtQual)
all[2349,"BsmtExposure"]=getmode(all$BsmtExposure)
all[2525,"BsmtCond"]=getmode(all$BsmtCond)
all[c(333,949,1488,2041,2186,2218,2219,2349,2525),c("BsmtFinType1","BsmtCond","BsmtExposure","BsmtQual","BsmtFinType2")]
BsmtCond: Evaluates the general condition of the basement
Ex Excellent Gd Good TA Typical - slight dampness allowed Fa Fair - dampness or some cracking or settling Po Poor - Severe cracking, settling, or wetness NA No Basement
#the NAs indicate "No Basement"
#replace NAs with "No Basement"
all[is.na(all$BsmtCond),"BsmtCond"]="No Basement"
#convert to ordinal
values=c("No Basement"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
all$BsmtCond=as.integer(revalue(all$BsmtCond, values))
The following `from` values were not present in `x`: Ex
class(all$BsmtCond)
[1] "integer"
BsmtExposure: Refers to walkout or garden level walls
Gd Good Exposure Av Average Exposure (split levels or foyers typically score average or above)
Mn Mimimum Exposure No No Exposure NA No Basement
#the NAs indicate "No Basement"
#replace NAs with "No Basement"
all[is.na(all$BsmtExposure),"BsmtExposure"]="No Basement"
#convert to ordinal
values=c("No Basement"=0,"No"=1,"Mn"=2,"Av"=3,"Gd"=4)
all$BsmtExposure=as.integer(revalue(all$BsmtExposure, values))
class(all$BsmtExposure)
[1] "integer"
BsmtQual: Evaluates the height of the basement
Ex Excellent (100+ inches) Gd Good (90-99 inches) TA Typical (80-89 inches) Fa Fair (70-79 inches) Po Poor (<70 inches NA No Basement
#the NAs indicate "No Basement"
#replace NAs with "No Basement"
all[is.na(all$BsmtQual),"BsmtQual"]="No Basement"
#convert to ordinal
values=c("No Basement"=0,"Po"=1,"Fa"=2,"TA"=3,"Gd"=4,"Ex"=5)
all$BsmtQual=as.integer(revalue(all$BsmtQual, values))
The following `from` values were not present in `x`: Po
class(all$BsmtQual)
[1] "integer"
BsmtFinType2: Rating of basement finished area (if multiple types)
GLQ Good Living Quarters ALQ Average Living Quarters BLQ Below Average Living Quarters
Rec Average Rec Room LwQ Low Quality Unf Unfinshed NA No Basement
#the NAs indicate "No Basement"
#replace NAs with "No Basement"
all[is.na(all$BsmtFinType2),"BsmtFinType2"]="No Basement"
#convert to ordinal
values=c("No Basement"=0,"Unf"=1,"LwQ"=2,"Rec"=3,"BLQ"=4,"ALQ"=5,"GLQ"=6)
all$BsmtFinType2=as.integer(revalue(all$BsmtFinType2, values))
class(all$BsmtFinType2)
[1] "integer"
BsmtFinTyp1: Rating of basement finished area
GLQ Good Living Quarters ALQ Average Living Quarters BLQ Below Average Living Quarters
Rec Average Rec Room LwQ Low Quality Unf Unfinshed NA No Basement
#the NAs indicate "No Basement"
#replace NAs with "No Basement"
all[is.na(all$BsmtFinType1),"BsmtFinType1"]="No Basement"
#convert to ordinal
values=c("No Basement"=0,"Unf"=1,"LwQ"=2,"Rec"=3,"BLQ"=4,"ALQ"=5,"GLQ"=6)
all$BsmtFinType1=as.integer(revalue(all$BsmtFinType1, values))
class(all$BsmtFinType1)
[1] "integer"
#replace NAs with zero
all[is.na(all$BsmtFullBath),"BsmtFullBath"]=0
hist(all$BsmtFullBath, main="Histogram of BsmtFullBath", xlab="number of full bathrooms in basement")
#replace NAs with zero
all[is.na(all$BsmtHalfBath),"BsmtHalfBath"]=0
hist(all$BsmtHalfBath, main="Histogram of BsmtHalfBath", xlab="number of half bathrooms in basement")
#replace NAs with zero
#they are integers
all[is.na(all$BsmtFinSF1),"BsmtFinSF1"]=0
all[is.na(all$BsmtFinSF2),"BsmtFinSF2"]=0
all[is.na(all$BsmtUnfSF),"BsmtUnfSF"]=0
all[is.na(all$TotalBsmtSF),"TotalBsmtSF"]=0
Putting related variables, MasVnrType (24 missing values) and MasVnrArea(23 missing values), to assess.
#the number of records with MasVnrType being NA given MasVnrArea is also NA.
tmp=all[is.na(all$MasVnrType) & is.na(all$MasVnrArea),]
sprintf("The number of records with with MasVnrType being NA given MasVnrArea is also NA: %s", dim(tmp)[1])
[1] "The number of records with with MasVnrType being NA given MasVnrArea is also NA: 23"
#finding the extra NA record of MasVnrType
all[is.na(all$MasVnrType) & !is.na(all$MasVnrArea), c("MasVnrType","MasVnrArea")]
#correct entry 2611
table(all$MasVnrType)
BrkCmn BrkFace None Stone
25 879 1742 249
#"BrkFace" has the highest mode after "None"
all[2611,c("MasVnrType")]="BrkFace"
print(all[2611,c("MasVnrType","MasVnrArea")])
#replace NAs with None
all[is.na(all$MasVnrType),"MasVnrType"]="None"
MasVnrType: Masonry veneer type
BrkCmn Brick Common BrkFace Brick Face CBlock Cinder Block None None Stone Stone
#check the relationship between the different category in MasVnrType and SalePrice
attach(all)
aggregate(all[,c("MasVnrType","SalePrice")],by=list(MasVnrType),FUN=mean,na.rm=TRUE)
argument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NAargument is not numeric or logical: returning NA
#relationship seems to exist between "MasVnrType" and "SalePrice"
#convert to ordinal
values=c("BrkCmn"=0,"None"=1,"BrkFace"=2,"Stone"=3)
all$MasVnrType=as.integer(revalue(all$MasVnrType, values))
class(all$MasVnrType)
[1] "integer"
#replace NAs with zero
all[is.na(all$MasVnrArea),"MasVnrArea"]=0
MSZoning: Identifies the general zoning classification of the sale.
A Agriculture C Commercial FV Floating Village Residential I Industrial RH Residential High Density RL Residential Low Density RP Residential Low Density Park RM Residential Medium Density
#input missing values with mode and convert to factor
all[is.na(all$MSZoning),"MSZoning"]=getmode(all$MSZoning)
all$MSZoning=as.factor(all$MSZoning)
class(all$MSZoning)
[1] "factor"
KitchenQual: Kitchen quality
Ex Excellent Gd Good TA Typical/Average Fa Fair Po Poor
#replace missing values with the mode
all[is.na(all$KitchenQual),"KitchenQual"]=getmode(all$KitchenQual)
sprintf("number of missing values in KitchenQual(after replacement):%s", sum(is.na(all$KitchenQual)))
[1] "number of missing values in KitchenQual(after replacement):0"
#convert to ordinal
values=c("Po"=0,"Fa"=1,"TA"=2,"Gd"=3,"Ex"=4)
all$KitchenQual=as.integer(revalue(all$KitchenQual, values))
The following `from` values were not present in `x`: Po
class(all$KitchenQual)
[1] "integer"
Utilities: Type of utilities available
AllPub All public Utilities (E,G,W,& S)
NoSewr Electricity, Gas, and Water (Septic Tank) NoSeWa Electricity and Gas Only ELO Electricity only
#checking the frequency of each level
table(all$Utilities)
AllPub NoSeWa
2916 1
#the table shows that most of the rows are categorised as "AllPub"
#this variable hence serve little value in predicting the response varaiable
all$Utilities=NULL
Functional: Home functionality (Assume typical unless deductions are warranted)
Typ Typical Functionality Min1 Minor Deductions 1 Min2 Minor Deductions 2 Mod Moderate Deductions Maj1 Major Deductions 1 Maj2 Major Deductions 2 Sev Severely Damaged Sal Salvage only
#replace the missing value with the mode
all[is.na(all$Functional),"Functional"]=getmode(all$Functional)
#convert to ordinal
values=c("Sal"=0,"Sev"=1,"Maj2"=2,"Maj1"=3,"Mod"=4,"Min2"=5,"Min1"=6,"Typ"=7)
all$Functional=as.integer(revalue(all$Functional, values))
The following `from` values were not present in `x`: Sal
class(all$Functional)
[1] "integer"
table(all$Functional)
1 2 3 4 5 6 7
2 9 19 35 70 65 2719
#checking the distribution of the Functional
table(all$Functional)
1 2 3 4 5 6 7
2 9 19 35 70 65 2719
Exterior1st: Exterior covering on house
AsbShng Asbestos Shingles AsphShn Asphalt Shingles BrkComm Brick Common BrkFace Brick Face CBlock Cinder Block CemntBd Cement Board HdBoard Hard Board ImStucc Imitation Stucco MetalSd Metal Siding Other Other Plywood Plywood PreCast PreCast Stone Stone Stucco Stucco VinylSd Vinyl Siding Wd Sdng Wood Siding WdShing Wood Shingles
#replace missing values with the mode
all[is.na(all$Exterior1st),"Exterior1st"]=getmode(all$Exterior1st)
#convert as factor
all$Exterior1st= as.factor(all$Exterior1st)
class(all$Exterior1st)
[1] "factor"
Exterior2nd: Exterior covering on house (if more than one material)
AsbShng Asbestos Shingles AsphShn Asphalt Shingles BrkComm Brick Common BrkFace Brick Face CBlock Cinder Block CemntBd Cement Board HdBoard Hard Board ImStucc Imitation Stucco MetalSd Metal Siding Other Other Plywood Plywood PreCast PreCast Stone Stone Stucco Stucco VinylSd Vinyl Siding Wd Sdng Wood Siding WdShing Wood Shingles
#replace missing values with the mode
all[is.na(all$Exterior2nd),"Exterior2nd"]=getmode(all$Exterior2nd)
#convert as factor
all$Exterior2nd= as.factor(all$Exterior2nd)
class(all$Exterior2nd)
[1] "factor"
33.Electrical: Electrical system
SBrkr Standard Circuit Breakers & Romex
FuseA Fuse Box over 60 AMP and all Romex wiring (Average)
FuseF 60 AMP Fuse Box and mostly Romex wiring (Fair)
FuseP 60 AMP Fuse Box and mostly knob & tube wiring (poor)
Mix Mixed
#replace missing value with mode and convert variable to factor
all[is.na(all$Electrical),"Electrical"]=getmode(all$Electrical)
all$Electrical=as.factor(all$Electrical)
class(all$Electrical)
[1] "factor"
table(all$Electrical)
FuseA FuseF FuseP Mix SBrkr
188 50 8 1 2672
SaleType: Type of sale
WD Warranty Deed - Conventional CWD Warranty Deed - Cash VWD Warranty Deed - VA Loan New Home just constructed and sold COD Court Officer Deed/Estate Con Contract 15% Down payment regular terms ConLw Contract Low Down payment and low interest ConLI Contract Low Interest ConLD Contract Low Down Oth Other
#replace missing value with mode and convert variable to factor
all[is.na(all$SaleType),"SaleType"]=getmode(all$SaleType)
all$SaleType=as.factor(all$SaleType)
class(all$SaleType)
[1] "factor"
table(all$SaleType)
COD Con ConLD ConLI ConLw CWD New Oth WD
87 5 26 9 8 12 239 7 2526
################################### Convert the remaining variables to appropriate class #######################################
Foundation: Type of foundation
BrkTil Brick & Tile
CBlock Cinder Block
PConc Poured Contrete
Slab Slab
Stone Stone
Wood Wood#converting into factor class
all$Foundation=as.factor(all$Foundation)
class(all$Foundation)
[1] "factor"
Heating: Type of heating
Floor Floor Furnace
GasA Gas forced warm air furnace
GasW Gas hot water or steam heat
Grav Gravity furnace
OthW Hot water or steam heat other than gas
Wall Wall furnace#converting into factor class
all$Heating=as.factor(all$Heating)
class(all$Heating)
[1] "factor"
HeatingQC: Heating quality and condition
Ex Excellent
Gd Good
TA Average/Typical
Fa Fair
Po Poor#converting into ordinal
values=c("Po"=0,"Fa"=1,"TA"=2,"Gd"=3,"Ex"=4)
all$HeatingQC=as.integer(revalue(all$HeatingQC, values))
class(all$HeatingQC)
[1] "integer"
CentralAir: Central air conditioning
N No
Y Yes#convert to ordinal
values=c("N"=0,"Y"=1)
all$CentralAir=as.integer(revalue(all$CentralAir, values))
class(all$CentralAir)
[1] "integer"
RoofStyle: Type of roof
Flat Flat
Gable Gable
Gambrel Gabrel (Barn)
Hip Hip
Mansard Mansard
Shed Shed#converting into factor class
all$RoofStyle=as.factor(all$RoofStyle)
class(all$RoofStyle)
[1] "factor"
f.RoofMatl: Roof material
ClyTile Clay or Tile
CompShg Standard (Composite) Shingle
Membran Membrane
Metal Metal
Roll Roll
Tar&Grv Gravel & Tar
WdShake Wood Shakes
WdShngl Wood Shingles
#converting into factor class
all$RoofMatl=as.factor(all$RoofMatl)
class(all$RoofMatl)
[1] "factor"
LandContour: Flatness of the property
Lvl Near Flat/Level
Bnk Banked - Quick and significant rise from street grade to building
HLS Hillside - Significant slope from side to side
Low Depression#converting into factor class
all$LandContour=as.factor(all$LandContour)
class(all$LandContour)
[1] "factor"
h.LandSlope: Slope of property
Gtl Gentle slope
Mod Moderate Slope
Sev Severe Slope
#converting to ordinal
values=c("Sev"=0,"Mod"=1,"Gtl"=2)
all$LandSlope=as.integer(revalue(all$LandSlope, values))
class(all$LandSlope)
[1] "integer"
BldgType: Type of dwelling
1Fam Single-family Detached
2FmCon Two-family Conversion; originally built as one-family dwelling
Duplx Duplex
TwnhsE Townhouse End Unit
TwnhsI Townhouse Inside Unit#convert to factor
all$BldgType=as.factor(all$BldgType)
class(all$BldgType)
[1] "factor"
HouseStyle: Style of dwelling
1Story One story
1.5Fin One and one-half story: 2nd level finished
1.5Unf One and one-half story: 2nd level unfinished
2Story Two story
2.5Fin Two and one-half story: 2nd level finished
2.5Unf Two and one-half story: 2nd level unfinished
SFoyer Split Foyer
SLvl Split Level#convert to factor
all$HouseStyle=as.factor(all$HouseStyle)
class(all$HouseStyle)
[1] "factor"
Neighborhood: Physical locations within Ames city limits
Blmngtn Bloomington Heights
Blueste Bluestem
BrDale Briardale
BrkSide Brookside
ClearCr Clear Creek
CollgCr College Creek
Crawfor Crawford
Edwards Edwards
Gilbert Gilbert
IDOTRR Iowa DOT and Rail Road
MeadowV Meadow Village
Mitchel Mitchell
Names North Ames
NoRidge Northridge
NPkVill Northpark Villa
NridgHt Northridge Heights
NWAmes Northwest Ames
OldTown Old Town
SWISU South & West of Iowa State University
Sawyer Sawyer
SawyerW Sawyer West
Somerst Somerset
StoneBr Stone Brook
Timber Timberland
Veenker Veenker#convert to factor
all$Neighborhood=as.factor(all$Neighborhood)
class(all$Neighborhood)
[1] "factor"
Condition1: Proximity to various conditions
Artery Adjacent to arterial street
Feedr Adjacent to feeder street
Norm Normal
RRNn Within 200' of North-South Railroad
RRAn Adjacent to North-South Railroad
PosN Near positive off-site feature--park, greenbelt, etc.
PosA Adjacent to postive off-site feature
RRNe Within 200' of East-West Railroad
RRAe Adjacent to East-West Railroad#convert to factor
all$Condition1=as.factor(all$Condition1)
class(all$Condition1)
[1] "factor"
Condition2: Proximity to various conditions (if more than one is present)
Artery Adjacent to arterial street
Feedr Adjacent to feeder street
Norm Normal
RRNn Within 200' of North-South Railroad
RRAn Adjacent to North-South Railroad
PosN Near positive off-site feature--park, greenbelt, etc.
PosA Adjacent to postive off-site feature
RRNe Within 200' of East-West Railroad
RRAe Adjacent to East-West Railroad#convert to factor
all$Condition2=as.factor(all$Condition2)
class(all$Condition2)
[1] "factor"
Street: Type of road access to property
Grvl Gravel
Pave Paved#convert to ordinal
values=c("Grvl"=0,"Pave"=1)
all$Street=as.integer(revalue(all$Street, values))
class(all$Street)
[1] "integer"
PavedDrive: Paved driveway
Y Paved
P Partial Pavement
N Dirt/Gravel#convert to ordinal
values=c("N"=0,"P"=1,"Y"=2)
all$PavedDrive=as.integer(revalue(all$PavedDrive, values))
class(all$PavedDrive)
[1] "integer"
ExterQual: Evaluates the quality of the material on the exterior
Ex Excellent
Gd Good
TA Average/Typical
Fa Fair
Po Poor#convert to ordinal
values=c("Po"=0,"Fa"=1,"TA"=2,"Gd"=3,"Ex"=4)
all$ExterQual=as.integer(revalue(all$ExterQual, values))
The following `from` values were not present in `x`: Po
class(all$ExterQual)
[1] "integer"
#check the distribution of the ordinal variable
table(all$ExterQual)
1 2 3 4
35 1798 979 107
ExterCond: Evaluates the present condition of the material on the exterior
Ex Excellent
Gd Good
TA Average/Typical
Fa Fair
Po Poor#convert to ordinal
values=c("Po"=0,"Fa"=1,"TA"=2,"Gd"=3,"Ex"=4)
all$ExterCond=as.integer(revalue(all$ExterCond, values))
class(all$ExterCond)
[1] "integer"
#check the distribution of the ordinal variable
table(all$ExterCond)
0 1 2 3 4
3 67 2538 299 12
YrSold: Year Sold (YYYY)
MoSold: Month Sold (MM)
#the prices of houses should be cheaper in earlier years.
#convert YrSold to ordinal
sprintf("Is data type of YrSold integer: %s", is.integer(YrSold))
[1] "Is data type of YrSold integer: TRUE"
#convert to factor
sprintf("Is data type of MoSold integer: %s", is.integer(MoSold))
[1] "Is data type of MoSold integer: TRUE"
all$MoSold=as.factor(all$MoSold)
class(all$MoSold)
[1] "factor"
MSSubClass: Identifies the type of dwelling involved in the sale.
20 1-STORY 1946 & NEWER ALL STYLES
30 1-STORY 1945 & OLDER
40 1-STORY W/FINISHED ATTIC ALL AGES
45 1-1/2 STORY - UNFINISHED ALL AGES
50 1-1/2 STORY FINISHED ALL AGES
60 2-STORY 1946 & NEWER
70 2-STORY 1945 & OLDER
75 2-1/2 STORY ALL AGES
80 SPLIT OR MULTI-LEVEL
85 SPLIT FOYER
90 DUPLEX - ALL STYLES AND AGES
120 1-STORY PUD (Planned Unit Development) - 1946 & NEWER
150 1-1/2 STORY PUD - ALL AGES
160 2-STORY PUD - 1946 & NEWER
180 PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
190 2 FAMILY CONVERSION - ALL STYLES AND AGES#convert to factor
all$MSSubClass=as.factor(all$MSSubClass)
str(all$MSSubClass)
Factor w/ 16 levels "20","30","40",..: 6 1 6 7 6 5 1 6 5 16 ...
LotShape: General shape of property
Reg Regular
IR1 Slightly irregular
IR2 Moderately Irregular
IR3 Irregular#convert LotShape as factor
values=c('IR3'=0, 'IR2'=1, 'IR1'=2, 'Reg'=3)
all$LotShape=as.integer(revalue(all$LotShape,values))
class(all$LotShape)
[1] "integer"
#check the distribution of the data
table(all$LotShape)
0 1 2 3
16 76 968 1859
LotConfig: Lot configuration
Inside Inside lot
Corner Corner lot
CulDSac Cul-de-sac
FR2 Frontage on 2 sides of property
FR3 Frontage on 3 sides of property#convert LotConfig as factor
all$LotConfig=as.factor(all$LotConfig)
class(all$LotConfig)
[1] "factor"
w.SaleCondition: Condition of sale
Normal Normal Sale
Abnorml Abnormal Sale - trade, foreclosure, short sale
AdjLand Adjoining Land Purchase
Alloca Allocation - two linked properties with separate deeds, typically condo with a garage unit
Family Sale between family members
Partial Home was not completed when last assessed (associated with New Homes)
#convert as factor
all$SaleCondition=as.factor(all$SaleCondition)
class(all$SaleCondition)
[1] "factor"
################################# Check the number of missing data and structure of the data #####################################
#tabulate the number of missing values
miss_freq=sort(colSums(is.na(all)),decreasing=TRUE)
print(miss_freq)
SalePrice MSSubClass MSZoning LotFrontage LotArea Street Alley
1459 0 0 0 0 0 0
LotShape LandContour LotConfig LandSlope Neighborhood Condition1 Condition2
0 0 0 0 0 0 0
BldgType HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle
0 0 0 0 0 0 0
RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond
0 0 0 0 0 0 0
Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
0 0 0 0 0 0 0
BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
0 0 0 0 0 0 0
X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
0 0 0 0 0 0 0
HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces
0 0 0 0 0 0 0
FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
0 0 0 0 0 0 0
GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
0 0 0 0 0 0 0
PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
0 0 0 0 0 0 0
SaleType SaleCondition
0 0
sprintf("There are %s columns of missing values", sum(miss_freq>0))
[1] "There are 1 columns of missing values"
#check the structure of the data
str(all)
'data.frame': 2919 obs. of 79 variables:
$ MSSubClass : Factor w/ 16 levels "20","30","40",..: 6 1 6 7 6 5 1 6 5 16 ...
$ MSZoning : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
$ LotFrontage : int 65 80 68 60 84 85 75 81 51 50 ...
$ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
$ Street : int 1 1 1 1 1 1 1 1 1 1 ...
$ Alley : Factor w/ 3 levels "Grvl","No alley access",..: 2 2 2 2 2 2 2 2 2 2 ...
$ LotShape : int 3 3 2 2 2 2 3 2 3 3 ...
$ LandContour : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
$ LotConfig : Factor w/ 5 levels "Corner","CulDSac",..: 5 3 5 1 3 5 5 1 5 1 ...
$ LandSlope : int 2 2 2 2 2 2 2 2 2 2 ...
$ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
$ Condition1 : Factor w/ 9 levels "Artery","Feedr",..: 3 2 3 3 3 3 3 5 1 1 ...
$ Condition2 : Factor w/ 8 levels "Artery","Feedr",..: 3 3 3 3 3 3 3 3 3 1 ...
$ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
$ HouseStyle : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
$ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
$ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
$ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
$ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
$ RoofStyle : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 2 2 2 2 ...
$ RoofMatl : Factor w/ 8 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Exterior1st : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 9 13 14 13 13 13 7 4 9 ...
$ Exterior2nd : Factor w/ 16 levels "AsbShng","AsphShn",..: 14 9 14 16 14 14 14 7 16 9 ...
$ MasVnrType : int 2 1 2 1 2 1 3 3 1 1 ...
$ MasVnrArea : num 196 0 162 0 350 0 186 240 0 0 ...
$ ExterQual : int 3 2 3 2 3 2 3 2 2 2 ...
$ ExterCond : int 2 2 2 2 2 2 2 2 2 2 ...
$ Foundation : Factor w/ 6 levels "BrkTil","CBlock",..: 3 2 3 1 3 6 3 2 1 1 ...
$ BsmtQual : int 4 4 4 3 4 4 5 4 3 3 ...
$ BsmtCond : int 3 3 3 4 3 3 3 3 3 3 ...
$ BsmtExposure : int 1 4 2 1 3 1 3 2 1 1 ...
$ BsmtFinType1 : int 6 5 6 5 6 6 6 5 1 6 ...
$ BsmtFinSF1 : num 706 978 486 216 655 ...
$ BsmtFinType2 : int 1 1 1 1 1 1 1 4 1 1 ...
$ BsmtFinSF2 : num 0 0 0 0 0 0 0 32 0 0 ...
$ BsmtUnfSF : num 150 284 434 540 490 64 317 216 952 140 ...
$ TotalBsmtSF : num 856 1262 920 756 1145 ...
$ Heating : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
$ HeatingQC : int 4 4 4 3 4 4 4 4 3 4 ...
$ CentralAir : int 1 1 1 1 1 1 1 1 1 1 ...
$ Electrical : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
$ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
$ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
$ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
$ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
$ BsmtFullBath : num 1 0 1 1 1 1 1 1 0 1 ...
$ BsmtHalfBath : num 0 1 0 0 0 0 0 0 0 0 ...
$ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
$ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
$ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
$ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
$ KitchenQual : int 3 2 3 3 3 2 3 2 2 2 ...
$ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
$ Functional : int 7 7 7 7 7 7 7 7 6 7 ...
$ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
$ FireplaceQu : int 0 3 3 4 3 0 4 3 3 3 ...
$ GarageType : Factor w/ 7 levels "2Types","Attchd",..: 2 2 2 6 2 2 2 2 6 2 ...
$ GarageYrBlt : int 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
$ GarageFinish : int 2 2 2 1 2 1 2 2 1 2 ...
$ GarageCars : num 2 2 2 3 3 2 2 2 2 1 ...
$ GarageArea : num 548 460 608 642 836 480 636 484 468 205 ...
$ GarageQual : int 3 3 3 3 3 3 3 3 2 4 ...
$ GarageCond : int 3 3 3 3 3 3 3 3 3 3 ...
$ PavedDrive : int 2 2 2 2 2 2 2 2 2 2 ...
$ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
$ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
$ EnclosedPorch: int 0 0 0 272 0 0 0 228 205 0 ...
$ X3SsnPorch : int 0 0 0 0 0 320 0 0 0 0 ...
$ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
$ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
$ PoolQC : num 0 0 0 0 0 0 0 0 0 0 ...
$ Fence : Factor w/ 5 levels "GdPrv","GdWo",..: 5 5 5 5 5 3 5 5 5 5 ...
$ MiscFeature : Factor w/ 5 levels "Gar2","None",..: 2 2 2 2 2 4 2 4 2 2 ...
$ MiscVal : int 0 0 0 0 0 700 0 350 0 0 ...
$ MoSold : Factor w/ 12 levels "1","2","3","4",..: 2 5 9 2 12 10 8 11 4 1 ...
$ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
$ SaleType : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
$ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
$ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
#save dataframe with its formatting.
saveRDS(all,file="all_DP.rds")